{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n",
      "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "import json\n",
    "import random\n",
    "import torch\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from transformers import AutoTokenizer, T5Config\n",
    "from malaya.torch_model.t5 import T5ForTokenClassification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('prepared.json') as fopen:\n",
    "    data = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "merged = list(itertools.chain(*(data['train_Y'] + data['test_Y'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['OTHER',\n",
       " 'ADJ',\n",
       " 'ADP',\n",
       " 'ADV',\n",
       " 'AUX',\n",
       " 'CCONJ',\n",
       " 'DET',\n",
       " 'NOUN',\n",
       " 'NUM',\n",
       " 'PART',\n",
       " 'PRON',\n",
       " 'PROPN',\n",
       " 'PUNCT',\n",
       " 'SCONJ',\n",
       " 'SYM',\n",
       " 'VERB',\n",
       " 'X']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "labels = ['OTHER'] + sorted(set(merged))\n",
    "labels_tag = {i: no for no, i in enumerate(labels)}\n",
    "label_list = list(labels_tag.keys())\n",
    "label_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = T5Config.from_pretrained('mesolitica/translation-t5-tiny-standard-bahasa-cased')\n",
    "config.num_labels = len(labels_tag)\n",
    "config.vocab = labels_tag\n",
    "config.rev_vocab = {v: k for v, k in labels_tag.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of T5ForTokenClassification were not initialized from the model checkpoint at mesolitica/translation-t5-tiny-standard-bahasa-cased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "model = T5ForTokenClassification.from_pretrained('mesolitica/translation-t5-tiny-standard-bahasa-cased', config = config)\n",
    "_ = model.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.\n",
      "You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained('mesolitica/translation-t5-tiny-standard-bahasa-cased', add_prefix_space = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "400000"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = []\n",
    "for i in range(len(data['train_X'][:200000])):\n",
    "    if len(data['train_X'][i]) != len(data['train_Y'][i]):\n",
    "        continue\n",
    "        \n",
    "    train.append({\n",
    "        'tokens': data['train_X'][i],\n",
    "        'ner_tags': [labels_tag[t] for t in data['train_Y'][i]]\n",
    "    })\n",
    "    \n",
    "for i in range(len(data['train_X'][200000:400000])):\n",
    "    if len(data['train_X'][i]) != len(data['train_Y'][i]):\n",
    "        continue\n",
    "    \n",
    "    train.append({\n",
    "        'tokens': [t.lower() for t in data['train_X'][i]],\n",
    "        'ner_tags': [labels_tag[t] for t in data['train_Y'][i]]\n",
    "    })\n",
    "    \n",
    "random.shuffle(train)\n",
    "train = pd.DataFrame(train).to_dict(orient = 'list')\n",
    "len(train['tokens'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4000"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test = []\n",
    "for i in range(len(data['test_X'][:2000])):\n",
    "    test.append({\n",
    "        'tokens': data['test_X'][i],\n",
    "        'ner_tags': [labels_tag[t] for t in data['test_Y'][i]]\n",
    "    })\n",
    "    \n",
    "for i in range(len(data['test_X'][2000:4000])):\n",
    "    test.append({\n",
    "        'tokens': [t.lower() for t in data['test_X'][i]],\n",
    "        'ner_tags': [labels_tag[t] for t in data['test_Y'][i]]\n",
    "    })\n",
    "    \n",
    "random.shuffle(test)\n",
    "test = pd.DataFrame(test).to_dict(orient = 'list')\n",
    "len(test['tokens'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize_and_align_labels(examples):\n",
    "    tokenized_inputs = tokenizer(examples[\"tokens\"], truncation=True, is_split_into_words=True)\n",
    "\n",
    "    labels = []\n",
    "    for i, label in enumerate(examples[\"ner_tags\"]):\n",
    "        word_ids = tokenized_inputs.word_ids(batch_index=i)\n",
    "        previous_word_idx = None\n",
    "        label_ids = []\n",
    "        for word_idx in word_ids:\n",
    "            if word_idx is None:\n",
    "                label_ids.append(-100)\n",
    "            elif word_idx != previous_word_idx:\n",
    "                label_ids.append(label[word_idx])\n",
    "            else:\n",
    "                label_ids.append(-100)\n",
    "            previous_word_idx = word_idx\n",
    "        labels.append(label_ids)\n",
    "\n",
    "    tokenized_inputs[\"labels\"] = labels\n",
    "    return tokenized_inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
     ]
    }
   ],
   "source": [
    "train = tokenize_and_align_labels(train)\n",
    "test = tokenize_and_align_labels(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def padding(x, y):\n",
    "    padded = tokenizer.pad([{'input_ids': x_} for x_ in x], return_tensors = 'pt')\n",
    "    sequence_length = padded['input_ids'].shape[1]\n",
    "    labels = [l + [-100] * (sequence_length - len(l)) for l in y]\n",
    "    labels = np.array(labels)\n",
    "    padded['labels'] = torch.from_numpy(labels)\n",
    "    for k in padded.keys():\n",
    "        padded[k] = padded[k].cuda()\n",
    "    return padded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainable_parameters = [param for param in model.parameters() if param.requires_grad]\n",
    "trainer = torch.optim.AdamW(trainable_parameters, lr = 2e-4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "400000"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(train['input_ids'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
     ]
    }
   ],
   "source": [
    "i = 0\n",
    "batch_size = 5\n",
    "x = test['input_ids'][i: i + batch_size]\n",
    "y = test['labels'][i: i + batch_size]\n",
    "padded = padding(x, y)\n",
    "\n",
    "loss, pred = model(**padded)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import evaluate\n",
    "\n",
    "seqeval = evaluate.load(\"seqeval\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 25000/25000 [09:17<00:00, 44.83it/s]\n",
      "/home/husein/.local/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: NOUN seems not to be NE tag.\n",
      "  warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
      "/home/husein/.local/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: AUX seems not to be NE tag.\n",
      "  warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
      "/home/husein/.local/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: PRON seems not to be NE tag.\n",
      "  warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
      "/home/husein/.local/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: VERB seems not to be NE tag.\n",
      "  warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
      "/home/husein/.local/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ADP seems not to be NE tag.\n",
      "  warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
      "/home/husein/.local/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: CCONJ seems not to be NE tag.\n",
      "  warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
      "/home/husein/.local/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: PROPN seems not to be NE tag.\n",
      "  warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
      "/home/husein/.local/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: PUNCT seems not to be NE tag.\n",
      "  warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
      "/home/husein/.local/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: NUM seems not to be NE tag.\n",
      "  warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
      "/home/husein/.local/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: DET seems not to be NE tag.\n",
      "  warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
      "/home/husein/.local/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ADV seems not to be NE tag.\n",
      "  warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
      "/home/husein/.local/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ADJ seems not to be NE tag.\n",
      "  warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
      "/home/husein/.local/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: SCONJ seems not to be NE tag.\n",
      "  warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
      "/home/husein/.local/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: SYM seems not to be NE tag.\n",
      "  warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
      "/home/husein/.local/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: PART seems not to be NE tag.\n",
      "  warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
      "/home/husein/.local/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: X seems not to be NE tag.\n",
      "  warnings.warn('{} seems not to be NE tag.'.format(chunk))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 0, loss: 0.1522444359567389, dev_predicted: 0.9406257698940625\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 25000/25000 [09:16<00:00, 44.93it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 1, loss: 0.0923361289499607, dev_predicted: 0.9408393632416786\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 25000/25000 [05:58<00:00, 69.79it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 2, loss: 0.06339012648392701, dev_predicted: 0.9403532155948017\n"
     ]
    }
   ],
   "source": [
    "batch_size = 16\n",
    "epoch = 100\n",
    "\n",
    "best_dev_acc = -np.inf\n",
    "patient = 1\n",
    "current_patient = 0\n",
    "\n",
    "for e in range(epoch):\n",
    "    pbar = tqdm(range(0, len(train['input_ids']), batch_size))\n",
    "    losses = []\n",
    "    for i in pbar:\n",
    "        trainer.zero_grad()\n",
    "        x = train['input_ids'][i: i + batch_size]\n",
    "        y = train['labels'][i: i + batch_size]\n",
    "        padded = padding(x, y)\n",
    "            \n",
    "        loss, pred = model(**padded)\n",
    "        loss.backward()\n",
    "        \n",
    "        grad_norm = torch.nn.utils.clip_grad_norm_(trainable_parameters, 5.0)\n",
    "        trainer.step()\n",
    "        losses.append(float(loss))\n",
    "        \n",
    "    dev_predicted = []\n",
    "    for i in range(0, len(test['input_ids']), batch_size):\n",
    "        x = test['input_ids'][i: i + batch_size]\n",
    "        y = test['labels'][i: i + batch_size]\n",
    "        padded = padding(x, y)\n",
    "        \n",
    "        loss, pred = model(**padded)\n",
    "        predictions = pred.detach().cpu().numpy().argmax(axis = 2).tolist()\n",
    "        dev_predicted.extend(predictions)\n",
    "    \n",
    "    true_predictions = [\n",
    "        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]\n",
    "        for prediction, label in zip(dev_predicted, test['labels'])\n",
    "    ]\n",
    "    true_labels = [\n",
    "        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]\n",
    "        for prediction, label in zip(dev_predicted, test['labels'])\n",
    "    ]\n",
    "    \n",
    "    dev_predicted = seqeval.compute(predictions=true_predictions, references=true_labels)['overall_f1']\n",
    "    \n",
    "    print(f'epoch: {e}, loss: {np.mean(losses)}, dev_predicted: {dev_predicted}')\n",
    "    \n",
    "    if dev_predicted >= best_dev_acc:\n",
    "        best_dev_acc = dev_predicted\n",
    "        current_patient = 0\n",
    "        model.save_pretrained('tiny')\n",
    "    else:\n",
    "        current_patient += 1\n",
    "    \n",
    "    if current_patient >= patient:\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_ = T5ForTokenClassification.from_pretrained('tiny')\n",
    "_ = model_.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'ART': {'precision': 0.8938547486033519,\n",
       "  'recall': 0.9411764705882353,\n",
       "  'f1': 0.9169054441260744,\n",
       "  'number': 170},\n",
       " 'CONJ': {'precision': 0.9713905522288756,\n",
       "  'recall': 0.9785522788203753,\n",
       "  'f1': 0.974958263772955,\n",
       "  'number': 1492},\n",
       " 'DJ': {'precision': 0.9192897497982244,\n",
       "  'recall': 0.88984375,\n",
       "  'f1': 0.9043271139341008,\n",
       "  'number': 1280},\n",
       " 'DP': {'precision': 0.9770908087220536,\n",
       "  'recall': 0.9844271412680756,\n",
       "  'f1': 0.9807452555755645,\n",
       "  'number': 3596},\n",
       " 'DV': {'precision': 0.9478672985781991,\n",
       "  'recall': 0.9523809523809523,\n",
       "  'f1': 0.9501187648456056,\n",
       "  'number': 1260},\n",
       " 'ERB': {'precision': 0.9654357459379616,\n",
       "  'recall': 0.9662921348314607,\n",
       "  'f1': 0.9658637505541599,\n",
       "  'number': 3382},\n",
       " 'ET': {'precision': 0.9603854389721628,\n",
       "  'recall': 0.9542553191489361,\n",
       "  'f1': 0.9573105656350054,\n",
       "  'number': 940},\n",
       " 'OUN': {'precision': 0.8789933694996986,\n",
       "  'recall': 0.8976608187134503,\n",
       "  'f1': 0.8882290239074159,\n",
       "  'number': 6498},\n",
       " 'RON': {'precision': 0.9888991674375578,\n",
       "  'recall': 0.9861623616236163,\n",
       "  'f1': 0.9875288683602771,\n",
       "  'number': 1084},\n",
       " 'ROPN': {'precision': 0.8842357164223751,\n",
       "  'recall': 0.8982072318444242,\n",
       "  'f1': 0.891166716912873,\n",
       "  'number': 6582},\n",
       " 'UM': {'precision': 0.9532391622016562,\n",
       "  'recall': 0.9688118811881188,\n",
       "  'f1': 0.9609624355511908,\n",
       "  'number': 2020},\n",
       " 'UNCT': {'precision': 0.9991261796574624,\n",
       "  'recall': 0.9980796089385475,\n",
       "  'f1': 0.9986026200873362,\n",
       "  'number': 5728},\n",
       " 'UX': {'precision': 1.0,\n",
       "  'recall': 0.9852941176470589,\n",
       "  'f1': 0.9925925925925926,\n",
       "  'number': 204},\n",
       " 'YM': {'precision': 0.8950617283950617,\n",
       "  'recall': 0.90625,\n",
       "  'f1': 0.9006211180124224,\n",
       "  'number': 160},\n",
       " '_': {'precision': 0.4444444444444444,\n",
       "  'recall': 0.5,\n",
       "  'f1': 0.47058823529411764,\n",
       "  'number': 16},\n",
       " 'overall_precision': 0.9370964022140221,\n",
       " 'overall_recall': 0.9446123445309775,\n",
       " 'overall_f1': 0.9408393632416786,\n",
       " 'overall_accuracy': 0.9579554043839759}"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dev_predicted = []\n",
    "for i in range(0, len(test['input_ids']), batch_size):\n",
    "    x = test['input_ids'][i: i + batch_size]\n",
    "    y = test['labels'][i: i + batch_size]\n",
    "    padded = padding(x, y)\n",
    "\n",
    "    loss, pred = model_(**padded)\n",
    "    predictions = pred.detach().cpu().numpy().argmax(axis = 2).tolist()\n",
    "    dev_predicted.extend(predictions)\n",
    "\n",
    "true_predictions = [\n",
    "    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]\n",
    "    for prediction, label in zip(dev_predicted, test['labels'])\n",
    "]\n",
    "true_labels = [\n",
    "    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]\n",
    "    for prediction, label in zip(dev_predicted, test['labels'])\n",
    "]\n",
    "\n",
    "seqeval.compute(predictions=true_predictions, references=true_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e3711ef55fa8497bae94a8f5a36846a5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model.safetensors:   0%|          | 0.00/84.7M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/mesolitica/pos-t5-tiny-standard-bahasa-cased/commit/510e71534bfd2a185b37f0f139b58122ae4be901', commit_message='Upload T5ForTokenClassification', commit_description='', oid='510e71534bfd2a185b37f0f139b58122ae4be901', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model_.push_to_hub('mesolitica/pos-t5-tiny-standard-bahasa-cased', safe_serialization = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "47272e0de3164ec88044183a4b31907b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "spiece.model:   0%|          | 0.00/803k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/mesolitica/pos-t5-tiny-standard-bahasa-cased/commit/fd551136cfc3317249884e51ae71fa80784d5684', commit_message='Upload tokenizer', commit_description='', oid='fd551136cfc3317249884e51ae71fa80784d5684', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.push_to_hub('mesolitica/pos-t5-tiny-standard-bahasa-cased')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
